# ----------------------------------------------
# Here, we take MasterDataDec2019, which is the raw July 2018 data (for all munis except for
# split ones), plus the political support and per capita fiscal variables used in the CPS paper.
# We add two types of data to this.
# The notes.doc file in this folder discusses the data in more detail.
# ----------------------------------------------







# --------------------------------------
# (1) ADDING FISCAL AND DEMOGRAPHIC VARIABLES FOR 2013, 2014, AND 2015:
# We collected data for 2013, 2014, and 2015 for the universe of Japanese municipalities.  The notes.doc
# document explains where the data came from.  Lets merge these into the Master Data.
# -------------------------------------------

# Notes:

# Split municipalities are not included in the Master Data, so while we collected the 2013-15 data for 
# these municipalities, when we merge with the MasterData, they will drop out.  
# We can always change the Stata code in the Organize do file to keep split municipalities in the dataset
# before saving the new variables made out, or we could manually add them back to MasterData.
# NB: the Organize file is run on "out_final_utf8 in ~TARGETING\replication2018\ANALYSIS OF MASTER DATA\CPS_support_variables_on_July2018data

# The 2015 census data doesn't contain data on municipalities that ceased to exist between 2010 and 2015, so
# these variables will be blank for municipalities in 2013 and 2014, even if they existed.







# -------------------------------------
# (a) Read in the new variables that exist in separate files and create a merged file:

setwd("C:/Users/ac6037/Dropbox/TARGETING/replication2018/ANALYSIS OF MASTER DATA/AddingData_to_MasterDataDec2019")

ceif_area <- read.csv("final_mun_ceif_areasize.csv")
primary <- read.csv("final_mun_employment_primary.csv")
pop_under <- read.csv("final_mun_pop_under.csv")
mun_pop <- read.csv("final_mun_population.csv")
ngaid <- read.csv("final_mun_ngaid.csv")
inc <- read.csv("final_mun_taxable_income.csv")

m1 <- merge(primary, pop_under, all=T) # merged perfectly
m2 <- merge(ngaid, ceif_area, all=T) # merged perfectly
rm(primary, pop_under, ngaid, ceif_area)
m3 <- merge(m2, mun_pop, all=T)
rm(m2, mun_pop)
m4 <- merge(m3, inc, all=T)
m5 <- merge(m1, m4, all=T)
# all 8 variables in m5

# Make a year_code variable and make sure there are no duplicates
m5$year_code <- paste(m5$year, m5$code, sep="_")
length(m5$year_code) - length(unique(m5$year_code))
# 38 duplicates exist

# read it out 
# write.csv(m5, "variables_13-15.csv")
# I opened in Excel and manually checked and fixed all the duplicates

# Brief notes on corrections.
# In MasterData, 2012 and 2013, ���Β� receives code 3301.  In taxable income, it changes to 3302 in 2014 and 2015.
# Lets change ���Β� to 3301 in 2014 and 2015.
# In Masterdata, ��� until 2012 gets 3305, and in 2014, ���s appears under 3216.  The appropriate code is 3216.
# In MasterData, ������ is 3302.  ni taxable income, its 3303, so change it back.
# ��蒬 should be 3303 in all years.

# ALSO: I fixed the municipalities that had "-" in some variables (I changed this to NA), and deleted
# the values for "���ʋ敔".

# Corrected file is "corrected_variables_13-15.csv".









# ---------------------------------------------------------
# (b) Add the "corrected_variables_13-15.csv" back into the Master Data 

setwd("C:/Users/ac6037/Dropbox/TARGETING/replication2018/ANALYSIS OF MASTER DATA/AddingData_to_MasterDataDec2019")

# First, we read in the master dataset and separate out the 2014 data
# Second, we take the new variables in and create the per capita and logged variables needed for those years
# Third, we merge these new variables onto the 2014 data, creating a dataset comprised of 2013, 2014, and 2015 data.
# Fourth, we attach this data (2013, 2014, and 2015) back onto the Master Data

# Create two MasterDatas: one running until 2012 and the other comprised of 2014 elections data
# (which we later attach):

mas <- read.csv("MasterDataDec2019.csv")
masall <- mas[mas$year!=2014,]
mas14 <- mas[mas$year==2014,]

# Take the 2012 data, make sure "code" == "muncode_num" 
# (It does, except for code==3305, 11445, 12402 have NA muncode_nums and for mun_name_needs.
# This means they don't have NEEDs data for 2012).
# This confirms that "code" values in 2014 can be used as "muncode_num" values.
# Take our 2014 data (none of which have mun_name_needs or muncode_num) 
# and write over the existing blank "muncode_num" values with "code" values.

mas14$muncode_num <- mas14$code
# The only variables we have for mas14 are electoral ones.  We can use year and muncode_num 
# to merge in the new variables.

# Read in new variables for 2013, 2014, and 2015
dat <- read.csv("corrected_variables_13-15.csv")
dat$mun_area_size <- dat$area_size
dat$muncode_num <- dat$code
# trim dat
dat1 <- dat[, -c(2, 9, 12)]
# dat1 has year, muncode_num, and mun_name_needs, plus the eight new variables.

# Lets create the per capita variables now, so that when we merge the 2013-15 data into the master data,
# we keep the original variables made in Stata for the non-2013/14/15 period:

dat1$mun_needy <- dat1$mun_pop_14_and_under + dat1$mun_pop_65_and_over
dat1$needy_pc <- dat1$mun_needy/dat1$mun_population
dat1$primary_pc <- dat1$mun_employment_primary/dat1$mun_population
dat1$income_pc <- dat1$mun_taxable_income/dat1$mun_population
dat1$logincome_pc <- log(dat1$income_pc)
dat1$mun_population_density <- dat1$mun_population/dat1$mun_area_size
dat1$mun_population_density <- dat1$mun_population_density/1000
dat1$lnpop <- log(dat1$mun_population)
dat1$ngaid_pc <- dat1$mun_ngaid/dat1$mun_population
dat1$logngaid_pc <- log(dat1$ngaid_pc)

# Take mas14, and delete all the columns that we're going to add from the new dataset (dat1)
# and "code" (we've already copied this to muncode_num) and "mun_name_needs" (which is all NA):

mas14$mun_employment_primary <- NULL
mas14$mun_pop_14_and_under <- NULL
mas14$mun_pop_65_and_over <- NULL
mas14$mun_ngaid <- NULL
mas14$mun_ceif <- NULL
mas14$mun_population <- NULL
mas14$mun_taxable_income <- NULL
mas14$mun_area_size <- NULL
mas14$code <- NULL
mas14$mun_name_needs <- NULL
mas14$mun_needy <- NULL
mas14$needy_pc <- NULL
mas14$primary_pc <- NULL
mas14$income_pc <- NULL
mas14$logincome_pc <- NULL
mas14$mun_population_density <- NULL
mas14$lnpop <- NULL
mas14$ngaid_pc <- NULL
mas14$logngaid_pc <- NULL

# Now, mas14 only has "year" and "muncode_num" in common with dat1 (the new data)
n <- colnames(mas14) %in% colnames(dat1)
table(n)

# Merge datasets on the columns they have in common (year and muncode_num):
mer <- merge(mas14, dat1, all=T)
write.csv(mer, "13-14-15.csv", row.names = F)

# This looks good: the resulting dataset has the same number of rows as the 13,14,15 dataset.
# We checked this, and in 2013 and 2015, data is populated only for the 18 variables

# mun_name_needs	mun_employment_primary	mun_pop_14_and_under	mun_pop_65_and_over	
# mun_ngaid	mun_ceif	mun_population	mun_taxable_income	mun_area_size	mun_needy	needy_pc	
# primary_pc	income_pc	logincome_pc	mun_population_density	lnpop	ngaid_pc	logngaid_pc


# Add back the "code" column so that we can merge this back into the masterdataset:
mer$code <- mer$muncode_num

# lets make sure all column names in mer are in the master data (excluding 2014):
n <- colnames(masall) %in% colnames(mer)
table(n)

# Lets rbind mer (data for 2013, 2014 and 2015) onto masall (data until 2012)
mer2 <- rbind(masall, mer) 
# Columns have to be the same, but not in the same order.  
# This keeps column order of masall

# We can read this out and eyeball coherence between, in 2014, mun_name_needs (which is from the
# new data) and mun_name_jed1 and mun_name_jed2 (use checking~csv document.  Checked all of them and all
# look fine):

write.csv(mer2, "Masterplus.csv", row.names = F)
# dim 105353 x 434 variables

rm(dat, dat1, mas, mas14, masall, mer)

# -------------------------------------------------------------------------------






# --------------------------------------
# (2) SNOW DATA

# -------------------------------------------

rm(list=ls())

setwd("C:/Users/ac6037/Dropbox/TARGETING/replication2018/ANALYSIS OF MASTER DATA/AddingData_to_MasterDataDec2019")

dat <- read.csv("Masterplus.csv")
# latest version of dataset, plus 2013, 2014, and 2015 data merged in

# Create a unique identifier for each column, which we use to merge data back in later:
dat$tomerge <- rownames(dat)

edat <- dat[!is.na(dat$mun_name2_jed),]
table(edat$year)
# 1980 1983 1986 1989 1993 1996 2000 2003 2005 2009 2012 2014 
# 3357 3361 3360 3357 3367 3354 3352 3303 2357 1859 1804 1812 

# Load snow data (final version, relies on earlier version of NEEDs data):
# See the R scripts in the Taishi Analysis folder and Muraoka data.R in my initial ANALYSIS~R script 
# (data is the JED data merged with snow data; then merged with the NEEDs data) for a description of
# all of the variables.

load("JEDSnow_Feb18.RData")

# This is elections data, meaning it is JED-populated data, matched to NEEDs data, 1980-2014.
sum(is.na(needs_elec2$mun_name2_jed))
#0
sum(is.na(needs_elec2$HOR_electoral_district))
#0
sum(is.na(needs_elec2$muncode_num)) 
# 1084 (when muncode_num is blank, this means we don't have NEEDs data for the observation)

# Check that municipalities with muncode_num==NA are missing NEEDs data
n <- needs_elec2[is.na(needs_elec2$muncode_num),]
sum(is.na(n$mun_name_needs))
# 1084

# Also, it contains split municipalities, which are out of the Masterplus
table(needs_elec2$split_mun_jed)

# Because these are out of Masterplus, lets create needs_elec3 which 
# excludes split municipalities, so as to match the Masterplus data:
needs_elec3 <- needs_elec2[needs_elec2$split_mun_jed==0,]

# Also, the 1990 election is coded as being in 1990 here, whereas it is coded as being in 1989 in edat
table(needs_elec3$year)
# Lets recode year==1989
needs_elec3$year[needs_elec3$year==1990] <- 1989
table(needs_elec3$year) # recoded

# Every year, the snow data has a handful more observations than the master data, which we can assume is the 
# corrected data.  Lets make sure that all the municipalities in the MasterData are in the snow data, so we can take the
# varibles we need:

edat$compare <- paste(edat$year, edat$hor_electoral_district, edat$mun_name1_jed, edat$mun_name2_jed, sep="_")
length(edat$compare) - length(unique(edat$compare)) #

needs_elec3$compare <- paste(needs_elec3$year, needs_elec3$HOR_electoral_district, needs_elec3$mun_name1_jed, needs_elec3$mun_name2_jed, sep="_")
length(needs_elec3$compare) - length(unique(needs_elec3$compare)) #

n <- edat$compare %in% needs_elec3$compare
table(n) # all TRUE

# Lets reduce the snow data down to the variables we need (snow-related), plus year, hor_electoral_district
# and mun_name2_jed:

# Reduce dataset down to variables we need, plus identifier variables
newv <- needs_elec3[, c("compare",
                        "prefecture",
                        "prefecture_jpn",
                        "election_date",
                        "snow_treat", 
                        "snow_distance", 
                        "snow_mixdistrict")] 

# Take Master Data and attach the snow variables
mer <- merge(edat, newv, all=F)
# new variables are attached on the end

# Now, lets merge mer back into MasterData using id
# reduce dataset to new vars:

new.mer <- mer[, c("tomerge",
                   "prefecture",
                   "prefecture_jpn",
                   "election_date",
                   "snow_treat", 
                   "snow_distance", 
                   "snow_mixdistrict")] 

final.mer <- merge(dat, new.mer, all=T)
final.mer$tomerge <- NULL

# This is the Master Data, plus 13, 14, and 15 data (called Masterplus), plus the snow variables:
# NB: new snow variables only populated for election years

write.csv(final.mer, file = "Master_plus_Snow.csv", row.names=F)
